library(ggplot2)
library(directlabels)
library(reshape)
library(plyr)
library(gcookbook)
library(plm)
library(multiwayvcov)
library(stargazer)
library(data.table)
###################### Functions #################
plot.all_manual <- function(data,  df_label, figurePath = "", x = "", y = "", groupBy = "exchange", xlab = "Depth Share", ylab = "Volume Share", title = "", lim = c(0,1),
                            lineColor = c("black","black"), lineSize = c(0.7,0.7), lineType = c("solid","solid"), addReg = FALSE, regCoef = c(1,1),
                            regLabels = c("45-Degree Lie","Regression"), R2_value = 0.99, exToExclude = c(),
                            axisSize = 22, exLabelSize = 5){
  
  # We count the number of observations where the depth and volume are = 0
  n <- length(data[(data[,x] == 0 & data[,y] == 0),][,x])
  if(n > 0){
    # If there are observations with depth and volume = 0, we set depth to NA.
    # Such an observation won't be included in the regression.
    data[(data[,x] == 0 & data[,y] == 0),][,x] <- NA
    if(length(exToExclude) > 0){
      # If there are exchanges that are listed as excluded, set their depth and volumes to NA
      data[data$EX %in% exToExclude,][,x] <- NA
      data[data$EX %in% exToExclude,][,y] <- NA
    }
  }
  
  g <- ggplot(data, aes_string(x = x, y = y, color = groupBy)) + geom_point(size = 2) + labs(title = paste(title), x = xlab, y = ylab) +
    theme(axis.text.x = element_text(size = axisSize), axis.text.y = element_text(size = axisSize), axis.title.x = element_text(size = axisSize), 
          axis.title.y = element_text(size = axisSize), title = element_text(size = 27), legend.position="none") +
    scale_x_continuous(breaks = seq(0,1,0.2), limits = lim) + scale_y_continuous(breaks = seq(0,1,0.2), limits = lim) +
    geom_text(data = df_label, aes(x = x, y = y, label = label, color = factor(label)), size = exLabelSize) +
    geom_abline(intercept = 0, slope = regCoef[1], size = lineSize[1], linetype = lineType[1], color = lineColor[1]) 
  
  ggsave(file=figurePath, plot = g, width = 12, height= 8.8, dpi=120)
  
}

exchange_code = c("A", "B", "C", "D", "I", "J", "K", "M", "N", "T", "P",
                  "S", "T/Q/O/1", "Q", "W", "X", "Y", "Z", "date", "X_1", "O", '8', '1')
exchange_name = c("NYSE MKT LLC","NASDAQ OMX BX, Inc.","National Stock Exchange, Inc.", "FINRA", "ISE Stock Exchange LLC",
                  "EDGA Exchange, Inc.","EDGX Exchange, Inc.","Chicago Stock Exchange, Inc.",
                  "New York Stock Exchange LLC","The Nasdaq Stock Market LLC","NYSE Arca, Inc.",
                  "Consolidated.Tape.System","The Nasdaq Stock Market LLC", "NASDAQ_Q", 
                  "CBOE Stock Exchange LLC", "NASDAQ OMX PSX LLC", "BATS Y-Exchange, Inc.", 
                  "BATS Exchange, Inc.", "date", "UNKNOWN_X_1", "NASDAQ_O", "UNKNOWN_8", "NASDAQ_1")
exchange_name_st = c("NYSE MKT","NASDAQ BX","NSX", "FINRA", "ISE",
                     "EDGA","EDGX","CHX",
                     "NYSE LLC","NASDAQ LLC","NYSE Arca",
                     "Consolidated.Tape.System","NASDAQ LLC", "NASDAQ_Q", 
                     "CBOE", "NASDAQ PSX", "BATS Y", 
                     "BATS", "date", "UNKNOWN_X_1", "NASDAQ_O", "UNKNOWN_8", "NASDAQ_1")

run.regression <- function(data, model, output = "coef", var = "", x = "depthShare", y = "volumeShare") {
  
  data <- data[!is.na(data[,x]) & !is.na(data[,y]), ]
  fit <- lm(model, data, na.action = na.omit)
  if(output == "coef"){
    return(fit$coef[var])
  }
  else if(output == "R2"){
    # Calculate R2 - We compute the R2 manually, because of the way R computes R2 values in a regression 
    # when the intercept is suppressed. We checked that the intercept is indeed suppressed here.
    # The current method uses SStot =  sum(sijt − si)^2, but R will use SStot =  sum(sijt − 0)^2.
    sm <- summary(fit)
    resid <- sm$residuals
    dif_mean <- data[,y] - mean(data[,y])
    R2 = 1 - sum(resid^2)/sum(dif_mean^2)
    return(R2)
  }
  else if(output == "MSE"){
    sm <- summary(fit)
    return(sqrt(mean(na.omit(sm$residuals^2))))
  }
  else{
    sm <- summary(fit)
    return(sm$r.squared)
  }
  
}

calculate.45degreeR2 <- function(data, type = "", x = '', y = ''){
  
  data <- data[!is.na(data[,x]) & !is.na(data[,y]), ]
  
  # Find the R^2 if you fix the relationship to be a=0 and b=1, 
  # that is, the 45 degree line explains what proportion of the variation.
  data$resid <- data[,y] - data[,x]
  data$dif_mean <- (data[,y] - mean(data[,y]))
  
  R2_45 <- 1 - sum(data$resid^2)/sum(data$dif_mean^2)
  
  if(type == "fe"){
    fit_45_fe <- lm(resid ~ 0 + isTaker + isOther, data = data, na.action = na.omit)
    R2_45_fe <- 1 - sum(fit_45_fe$resid^2)/sum(data$dif_mean^2)
    return(R2_45_fe)
  }
  else if(type == "mse"){
    return(sqrt(mean(data$resid^2)))
  }
  else{
    return(R2_45)
  }
  
}

output.regressions_vf_perSpec <- function(list_df, list_xVars, list_yVars, list_specifications, digits = 3) {
  
  for(i in seq(1,length(list_df))){
    df <- data.frame(list_df[[i]])
    x <- list_xVars[[i]]
    y <- list_yVars[[i]]
    specif <- list_specifications[[i]]
    
    # Suppress the intercept
    fm <- as.formula(paste0(y,"~ 0 + ",x))
    
    # Removes observations where both depth and volume = 0.
    df_noZeros <- df[!(df[,x] == 0 & df[,y] == 0),]
    # df_noZeros <- df[!(df[,'f_NYSE_Listed'] == 0 & df[,'exchange'] == 'NYSE LLC'),]
    
    fit <- lm(fm, df_noZeros)
    
    # Clusters by Symbol and Exchange
    vcov <- formatC(round(sqrt(diag(cluster.vcov(fit, ~ symbol_num + exchange_num))), digits), format = "f", digits)
    
    R2 <- formatC(round(run.regression(df_noZeros, model = as.formula(paste0(y," ~ 0 +", x)), output = "R2", var = "", x = x, y = y), digits), format = "f", digits = digits)
    
    coef <- formatC(round(fit$coef,digits),format = "f", digits)
    
    if(i == 1){
      output <- data.frame(specification = specif, coef = coef, se = vcov, obs = nobs(fit), R2 = R2)
      rownames(output) <- i
    }else{
      temp <- data.frame(specification = specif, coef = coef, se = vcov, obs = nobs(fit), R2 = R2)
      rownames(temp) <- i
      output <- rbind(output, temp)
    }
    
  }
  
  return(output)
}



####################### setting up project path and downloading data  ####################
filepath <- "path-to-data-appendix/3_Analysis_of_TAQ_data"
figurePath <- paste0(filepath,"/Final_Output/Figures/")
tablePath <- paste0(filepath,"/Final_Output/Tables/")

## importing symbol universe/characteristics
data_symStats <- read.csv(paste0(filepath, "WRDS_Server/Output/Symbol_Universe_2015.csv"))
data_symStats <- data_symStats[c("SYM_ROOT","SYM_SUFFIX","LISTED_MARKET","f_Sample_1",
                                 "f_Sample_1_T100","ADV_S_Standard","n_Price_Standard", "f_NYSE_Listed")]


#############################################################################
#Daily   
#############################################################################

## importing depth volume share data
data_depth <- read.csv(paste0(filepath,"WRDS_Server/Output/DepthVolumeShare.csv"))
data_depth = data.table(data_depth)

# Top 8 only
data_depth <- data_depth[data_depth$isMain8==1,]
data_depth$exchange <- mapvalues(data_depth$EX, from = exchange_code, to = exchange_name_st)
data_depth$DATE <- as.Date(as.character(data_depth$DATE), format = "%Y-%m-%d")

mydata <- merge(data_depth, data_symStats, by = c("SYM_ROOT","SYM_SUFFIX"))

# Get variables combining symbol, exchange, month
mydata$symbolEx <- paste(as.character(mydata$symbol), as.character(mydata$EX), sep = "_")
mydata$month <- format(mydata$DATE, "%b")
mydata$exMonth <- paste(as.character(mydata$exchange), as.character(mydata$month), sep = "_")
mydata$month_num <- as.numeric(format(mydata$DATE, "%m"))
mydata$symbol_num <- mapvalues(mydata$symbol, from = as.character(unique(mydata$symbol)), to = seq(1,length(unique(mydata$symbol))))
mydata$exchange_num <- mapvalues(mydata$EX, from = as.character(unique(mydata$EX)), to = seq(1,length(unique(mydata$EX))))
mydata$exMonth_num <- mapvalues(mydata$exMonth, from = as.character(unique(mydata$exMonth)), to = seq(1,length(unique(mydata$exMonth))))
mydata$exSymbol_num <- mapvalues(mydata$symbolEx, from = as.character(unique(mydata$symbolEx)), to = seq(1,length(unique(mydata$symbolEx))))

## exchange labels
exchange_nameList <- c("NYSE MKT","NASDAQ BX","EDGA","EDGX","CHX","NYSE LLC","NYSE Arca","NASDAQ LLC","NASDAQ PSX", "BATS Y", "BATS")
exchange_nameList_main8 <- c("NASDAQ BX","EDGA","EDGX","NYSE LLC","NYSE Arca","NASDAQ LLC","BATS Y","BATS")
exchange_nameList_mainMaker <- c("EDGX","NYSE LLC","NYSE Arca","NASDAQ LLC","BATS")

######### Output Regression information
##
out_allCuts <- output.regressions_vf_perSpec(list_df = list(mydata[mydata$isMain8 == 1,], mydata[mydata$isMainMaker == 1,]),
                                             list_xVars = list("depthShare_Main8","depthShare_mainMaker"),
                                             list_yVars = list("volumeShare_Main8","volumeShare_mainMaker"),
                                             list_specifications = list("5 Main Makers + 3 Takers", "All Symbols,  5 Main Maker-Takers"))

out_allCuts_top100 <- output.regressions_vf_perSpec(list_df = list(mydata[mydata$f_Sample_1_T100 == 1 & mydata$isMain8 == 1,], mydata[mydata$f_Sample_1_T100 == 1 & mydata$isMainMaker == 1,]),
                                                    list_xVars = list("depthShare_Main8","depthShare_mainMaker"),
                                                    list_yVars = list("volumeShare_Main8","volumeShare_mainMaker"),
                                                    list_specifications = list("Top 100 Symbols,  5 Main Makers + 3 Takers", "Top 100 Symbols,  5 Main Maker-Takers"))

out_vf <- rbind(out_allCuts, out_allCuts_top100)
write.csv(out_vf, paste(tablePath, "depth_vol_reg_output.csv", sep = "/"))

######### Plot figures for Daily
mydata = data.frame(mydata)
df_label_mainMaker <- data.frame(label = exchange_nameList_mainMaker,
                                    x = c(               0.25, 0.78, 0.39, 0.55,        0.25),  
                                    y = c(               0.61, 0.64,  0.07, 0.75,        0.03))  
plot.all_manual(mydata[mydata$f_Sample_1_T100 == 1, ], df_label_mainMaker, figurePath = paste0(figurePath, "depthVolumeShare_top100_main5.png"),
                x = "depthShare_mainMaker", y = "volumeShare_mainMaker", groupBy = "exchange", xlab = "Depth Share", ylab = "Volume Share", lim = c(-0.05,1),
                exToExclude = c('B', 'Y', 'J'), axisSize = 31, exLabelSize = 7)

## Only 8 Main exchanges (5 main makers + 3 takers) - top 100 only
df_label_main8 <- data.frame(label = exchange_nameList_main8,
                            x = c(0.08, -0.045, 0.25, 0.78, 0.39, 0.55, -0.01, 0.25),
                            y = c(-0.03, 0.1 , 0.61, 0.64,  0.07, 0.75, 0.33, 0.03))
plot.all_manual(mydata[mydata$f_Sample_1_T100 == 1, ], df_label_main8, figurePath = paste0(figurePath, "depthVolumeShare_top100_main8.png"),
               x = "depthShare_Main8", y = "volumeShare_Main8", groupBy = "exchange", xlab = "Depth Share", ylab = "Volume Share", title = "", lim = c(-0.05,1),
               axisSize = 31, exLabelSize = 7)

# ## Only 5 Main exchanges (5 main makers + 3 takers) w/o recalculating the shares - top 100 only
# df_label_main8_wjust5 <- data.frame(label = exchange_nameList_mainMaker,
#                                    x = c(               0.25, 0.78, 0.39, 0.55,        0.25),
#                                    y = c(               0.61, 0.64,  0.07, 0.75,        0.03))
# plot.all_manual(mydata[mydata$f_Sample_1_T100 == 1, ], df_label_main8_wjust5, figurePath = paste0(figurePath, "depthVolumeShare_top100_main5_sharesReCalcWMain8.png"),
#                x = "depthShare_Main8", y = "volumeShare_Main8", groupBy = "exchange", xlab = "Depth Share", ylab = "Volume Share", title = "", lim = c(-0.05,1),
#                exToExclude = c('B', 'Y', 'J'), axisSize = 31, exLabelSize = 7)


#############################################################################
# 1-hour  
#############################################################################
# 
# # Note this data hasn't currently been run. 
# # To run this code, re-run the depth volume code at the appropriate interval.
# for(interval in c('1hour', '5min')){
#   data_depth <- fread(paste0(filepath, "Data/Subsample_DepthVolume", interval, ".csv"), 
#                       colClasses = list(character = 4), stringsAsFactors = TRUE)
#   data_depth = data.frame(data_depth)
#   
#   data_depth <- data_depth[!data_depth$EX %in% c("D","I","W"),]
#   data_depth$exchange <- mapvalues(data_depth$EX, from = exchange_code, to = exchange_name_st)
#   data_depth$DATE <- as.Date(as.character(data_depth$DATE), format = "%Y-%m-%d")
#   
#   mydata <- merge(data_depth, data_symStats, by = c("SYM_ROOT","SYM_SUFFIX"))
#   
#   mydata$symbolEx <- paste(as.character(mydata$symbol), as.character(mydata$EX), sep = "_")
#   mydata$month <- format(mydata$DATE, "%b")
#   mydata$exMonth <- paste(as.character(mydata$exchange), as.character(mydata$month), sep = "_")
#   mydata$month_num <- as.numeric(format(mydata$DATE, "%m"))
#   mydata$symbol_num <- mapvalues(mydata$symbol, from = as.character(unique(mydata$symbol)), to = seq(1,length(unique(mydata$symbol))))
#   mydata$exchange_num <- mapvalues(mydata$EX, from = as.character(unique(mydata$EX)), to = seq(1,length(unique(mydata$EX))))
#   mydata$exMonth_num <- mapvalues(mydata$exMonth, from = as.character(unique(mydata$exMonth)), to = seq(1,length(unique(mydata$exMonth))))
#   
#   ## exchange labels
#   exchange_nameList <- c("NYSE MKT","NASDAQ BX","EDGA","EDGX","CHX","NYSE LLC","NYSE Arca","NASDAQ LLC","NASDAQ PSX", "BATS Y", "BATS")
#   exchange_nameList_main8 <- c("NASDAQ BX","EDGA","EDGX","NYSE LLC","NYSE Arca","NASDAQ LLC","BATS Y","BATS")
#   exchange_nameList_mainMaker <- c("EDGX","NYSE LLC","NYSE Arca","NASDAQ LLC","BATS")
#   
#   output1 = rbind(output1, reg.output(data_values = mydata, noconst = FALSE, interval = interval))
#   output2 = rbind(output2, reg.output(data_values = mydata , noconst = TRUE, interval = interval))
#   
#   ######### Plot figures for hourly, top 100
#   df_label_mainMaker <- data.frame(label = exchange_nameList_mainMaker,
#                                    x = c(               0.25, 0.78, 0.39, 0.55,        0.25),  
#                                    y = c(               0.61, 0.64,  0.07, 0.75,        0.03))  
#   plot.all_manual(mydata[mydata$f_Sample_1_T100 == 1, ], df_label_mainMaker, 
#                   figurePath = paste0(figurePath, "subsample_depthVolumeShare_top100_main5", interval, ".png", sep = ''),
#                   x = "depthShare_mainMaker", y = "volumeShare_mainMaker", groupBy = "exchange", xlab = "Depth Share", ylab = "Volume Share", title = "1 hr Intervals", lim = c(-0.05,1),
#                   exToExclude = c('B', 'Y', 'J'), axisSize = 31, exLabelSize = 7)
#   
#   ## Only 8 Main exchanges (5 main makers + 3 takers) - top 100 only
#   df_label_main8 <- data.frame(label = exchange_nameList_main8,
#                                x = c(0.08, -0.047, -0.047, 0.78, 0.5, 0.55, -0.05, 0.30),
#                                y = c(-0.03, 0.1 , 0.61, 0.9,  -0.03, 0.93, 0.33, -0.03))
#   plot.all_manual(mydata[mydata$f_Sample_1_T100 == 1 , ], df_label_main8, 
#                   figurePath = paste0(figurePath, "depthVolumeShare_top100_main8ND", interval, ".png", sep = ''),
#                   x = "depthShare_Main8", y = "volumeShare_Main8", groupBy = "exchange", xlab = "Depth Share", ylab = "Volume Share", title = "", lim = c(-0.05,1),
#                   axisSize = 31, exLabelSize = 7)
#   
#   ## Only 5 Main exchanges (5 main makers + 3 takers) w/o recalculating the shares - top 100 only
#   df_label_main8_wjust5 <- data.frame(label = exchange_nameList_mainMaker,
#                                       x = c(               -0.047, 0.78, 0.5, 0.55,        0.30),
#                                       y = c(              0.61, 0.9,  -0.03, 0.93,       -0.03))
#   plot.all_manual(mydata[mydata$f_Sample_1_T100 == 1, ], df_label_main8_wjust5, 
#                   figurePath = paste0(figurePath, "depthVolumeShare_top100_main5_sharesReCalcWMain8ND", interval, ".png", sep = ''),
#                   x = "depthShare_Main8", y = "volumeShare_Main8", groupBy = "exchange", xlab = "Depth Share", 
#                   ylab = "Volume Share", title = "", lim = c(-0.05,1),
#                   exToExclude = c('B', 'Y', 'J'), axisSize = 31, exLabelSize = 7)
#   
# }

# Note that more symbols appear in the depth than in the volume. 
# These symbols are not in the top 100 for our sample, so I don't worry about them.
# These are places where there was a bid/ask but no regular volume trades were listed except
# on FINRA

